experiment_name <- "fveg_1-24_gb"
experiment_path <- here("data", "4_for_analysis", "ML_outputs", "experiments", experiment_name)
dataset <- "fveg"
months_ts <- TRUE
# read in the fallow fields predictions
fallow <- fread(here(experiment_path, "fallow.csv"))
fallow[fallow == -9999] <- NA # this is the na value
if (months_ts){
# get a nice column of numeric months
months <- select(fallow, names(fallow)[grepl("month", names(fallow))])
fallow$month <- names(months)[max.col(months)]
fallow$month <- str_extract(fallow$month, '(?<=_)\\d+') %>% as.numeric()
}
if (months_ts){
# with different months
r2 <- summary(lm(ET~ET_pred, data=fallow))$r.squared
Bias <- mean(fallow$ET_pred - fallow$ET, na.rm=TRUE)
ggplot(fallow) +
geom_jitter(aes(x=ET_pred, y=ET, color=as.factor(month)), alpha=0.2, size =.1) +
theme_classic() +
geom_abline(intercept=0,slope=1, color="red") +
# annotate("text", x=4, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(Bias, 3))) +
xlab("Predicted natural ET (mm/day)") +
ylab("Observed fallow ET (mm/day)") +
xlim(c(0,6)) +
ylim(c(0,6))
}
## Warning: Removed 242004 rows containing missing values (geom_point).
print(paste("R2:", round(r2, 3), "Bias:", round(Bias, 3)))
## [1] "R2: 0.512 Bias: 2.836"
if (months_ts){
# averaging over different months
fallow_year <- fallow %>% group_by(x, y) %>% summarize(ET = mean(ET, na.rm=TRUE),
ET_pred = mean(ET_pred, na.rm=TRUE))
} else {
fallow_year <- fallow
}
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
r2 <- summary(lm(ET~ET_pred, data=fallow_year))$r.squared
Bias <- mean(fallow_year$ET_pred - fallow_year$ET, na.rm=TRUE)
ggplot(fallow_year) +
geom_jitter(aes(x=ET_pred, y=ET), alpha=0.2, size =.1) +
theme_classic() +
geom_abline(intercept=0,slope=1, color="red") +
# annotate("text", x=3, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(Bias, 3))) +
xlab("Predicted natural ET (mm/day)") +
ylab("Observed fallow ET (mm/day)") +
xlim(c(0,4)) +
ylim(c(0,4))
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in min(diff(sort(x))): no non-missing arguments to min; returning Inf
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in stats::runif(length(x), -amount, amount): NAs produced
## Warning: Removed 20473 rows containing missing values (geom_point).
print(paste("R2:", round(r2, 3), "Bias:", round(Bias, 3)))
## [1] "R2: 0.534 Bias: 2.836"
if (months_ts){
# plot out the time series
fallow_ts <- fallow %>% group_by(month) %>% summarize(ET = mean(ET, na.rm=TRUE),
ET_pred = mean(ET_pred, na.rm=TRUE))
ggplot(fallow_ts) +
geom_line(aes(x=month, y=ET), color="red") +
geom_line(aes(x=month, y=ET_pred), color="black")
}
Here I just got curious about what ag ET vs fallow ET look like. Seems like fallow ET is probably artificially high.
# ag <- fread(here("data", "3_for_counterfactual", "agriculture", "agriculture.csv"))
# ag$month <- str_extract(ag$month, '(?<=_)\\d+') %>% as.numeric()
#
# # plot out the time series
# ag_ts <- ag %>% group_by(month) %>% summarize(ET = mean(ET, na.rm=TRUE))
#
# ggplot(ag_ts) +
# geom_line(aes(x=month, y=ET), color="red") +
# geom_line(data=fallow_ts, aes(x=month, y=ET), color="black")
# read in the spatial_CV predictions
cv <- fread(here(experiment_path, "crossval_predictions_test.csv"))
cv[cv == -9999] <- NA # this is the na value
if (months_ts){
# get a nice column of numeric months
months <- select(cv, names(cv)[grepl("month", names(cv))])
cv$month <- names(months)[max.col(months)]
cv$month <- str_extract(cv$month, '(?<=_)\\d+') %>% as.numeric()
}
cv$mean_dist <- cv$fold_size/6 # the average distance between a held out point and the border to the hold out
if (months_ts){
gridded_cv_metrics <- cv %>%
group_by(x, y, cv_fold, fold_size) %>%
summarize(ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE)) %>%
group_by(cv_fold, fold_size) %>%
summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
Bias = mean(ET_pred - ET, na.rm = TRUE),
RMSE = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE))
} else {
gridded_cv_metrics <- cv %>%
group_by(cv_fold, fold_size) %>%
summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
Bias = mean(ET_pred - ET, na.rm = TRUE),
RMSE = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE))
}
## `summarise()` has grouped output by 'x', 'y', 'cv_fold'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'cv_fold'. You can override using the `.groups` argument.
Turn the cv fold back into something meaningful
# first turn "fold" into x and y coordinates
gridded_cv_metrics$x <- as.numeric(str_extract(gridded_cv_metrics$cv_fold, "[-.0-9]*(?=,)")) + gridded_cv_metrics$fold_size/200000
gridded_cv_metrics$y <- as.numeric(str_extract(gridded_cv_metrics$cv_fold, "(?<=,)[-.0-9]*")) + gridded_cv_metrics$fold_size/200000
# study area and counties for plotting
study_area <- st_read(study_area_loc) %>% st_transform(st_crs("+proj=longlat +datum=WGS84"))
## Reading layer `study_area' from data source
## `/home/waves/users/annaboser/ET_ag_disALEXI/data/1_raw/study_area/study_area.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 3 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -218673.3 ymin: -352229.6 xmax: 135464.2 ymax: 256511.5
## Projected CRS: NAD83 / California Albers
counties <- st_read(counties_loc) %>% filter(STATEFP == "06") %>% st_transform(st_crs("+proj=longlat +datum=WGS84"))
## Reading layer `cb_2018_us_county_500k' from data source
## `/home/waves/users/annaboser/ET_ag_disALEXI/data/1_raw/counties/cb_2018_us_county_500k.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 3233 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -179.1489 ymin: -14.5487 xmax: 179.7785 ymax: 71.36516
## Geodetic CRS: NAD83
gridded_cv_metrics$RMSE <- ifelse(gridded_cv_metrics$RMSE>1.5, 1.5, gridded_cv_metrics$RMSE)
gridded_cv_metrics$Bias <- ifelse(gridded_cv_metrics$Bias>1, 1, gridded_cv_metrics$Bias)
gridded_cv_metrics$Bias <- ifelse(gridded_cv_metrics$Bias<(-1), -1, gridded_cv_metrics$Bias)
filter(gridded_cv_metrics) %>%
ggplot() +
geom_sf(data = counties, color=alpha("white",1), size = .2) +
geom_raster(aes(x=x, y=y, fill=Bias)) +
facet_wrap(vars(fold_size/1000)) +
scale_fill_distiller(palette="Spectral", direction = -1, limits = c(-1,1)) +
geom_sf(data = study_area, fill=alpha("red",0), color = "black", size = .2) +
theme_classic() +
xlim(c(-122.92, -118)) +
ylim(c(34.7, 40.754)) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
## Warning: Raster pixels are placed at uneven horizontal intervals and will be
## shifted. Consider using geom_tile() instead.
## Warning: Raster pixels are placed at uneven vertical intervals and will be
## shifted. Consider using geom_tile() instead.
filter(gridded_cv_metrics) %>%
ggplot() +
geom_sf(data = counties, color=alpha("white",1), size = .2) +
geom_raster(aes(x=x, y=y, fill=RMSE)) +
facet_wrap(vars(fold_size/1000)) +
scale_fill_distiller(palette="Spectral", direction = -1, limits = c(0,1.5)) +
geom_sf(data = study_area, fill=alpha("red",0), color = "black", size = .2) +
theme_classic() +
xlim(c(-122.92, -118)) +
ylim(c(34.7, 40.754)) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
## Warning: Raster pixels are placed at uneven horizontal intervals and will be shifted. Consider using geom_tile() instead.
## Raster pixels are placed at uneven vertical intervals and will be shifted. Consider using geom_tile() instead.
First get the distribution of distances between ag and natural data (specifically the test data)
dist_filename <- here(distance_distribution_path, paste0(dataset, ".csv"))
if (file.exists(dist_filename)){
library(parallel)
library(tictoc)
dist <- fread(dist_filename)
} else {
stop('No distance file for this dataset. Make one using 3_analysis/1_additional_data_manipulation/2_ag_natural_distance.R')
}
##
## Attaching package: 'tictoc'
## The following object is masked from 'package:data.table':
##
## shift
if (months_ts){
cv_metrics <- cv %>%
group_by(x, y, fold_size, mean_dist) %>%
summarize(ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE)) %>%
group_by(fold_size, mean_dist) %>%
summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
Bias = mean(ET_pred - ET, na.rm = TRUE),
RMSE = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE))
} else {
cv_metrics <- cv %>%
group_by(fold_size, mean_dist) %>%
summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
Bias = mean(ET_pred - ET, na.rm = TRUE),
RMSE = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE))
}
## `summarise()` has grouped output by 'x', 'y', 'fold_size'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'fold_size'. You can override using the `.groups` argument.
coef = 2/5
ggplot(NULL) +
stat_density(data = dist, aes(x=mindist/1000), fill = "grey10", alpha = .3) +
geom_line(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef), color = "red") +
geom_point(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef), color = "red") +
xlab(TeX("Distance of agricultural pixel to nearest natural pixel (km) \n Average distance of point to border of hold-out (km)")) +
theme_bw() +
theme(axis.title.x = element_text(vjust=-2.5)) +
xlim(c(0, max(cv_metrics$mean_dist/1000))) +
scale_y_continuous(
name = "Distribution of agricultural pixels",
sec.axis = sec_axis(~./coef, name=TeX("Model performance ($R^{2}$)")))
## Warning: Removed 3 rows containing non-finite values (stat_density).
if (months_ts){
cv_metrics <- cv %>%
group_by(fold_size, month, mean_dist) %>%
summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
Bias = mean(ET_pred - ET, na.rm = TRUE),
RMSE = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
ET = mean(ET, na.rm = TRUE),
ET_pred = mean(ET_pred, na.rm = TRUE))
coef = 2/5
ggplot(NULL) +
stat_density(data = dist, aes(x=mindist/1000), fill = "grey10", alpha = .3) +
geom_line(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef, color = as.factor(month))) +
geom_point(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef, color = as.factor(month))) +
xlab(TeX("Distance of agricultural pixel to nearest natural pixel (km) \n Average distance of point to border of hold-out (km)")) +
theme_bw() +
theme(axis.title.x = element_text(vjust=-2.5)) +
xlim(c(0, max(cv_metrics$mean_dist/1000))) +
scale_y_continuous(
name = "Distribution of agricultural pixels",
sec.axis = sec_axis(~./coef, name=TeX("Model performance ($R^{2}$)")))
}
## `summarise()` has grouped output by 'fold_size', 'month'. You can override using
## the `.groups` argument.
## Warning: Removed 3 rows containing non-finite values (stat_density).
Make these using the CV with the fold size that is closest to the double of the mean distance between ag and natural (a bit conservative but only slightly)
mean_dist <- mean(dist$mindist)
folds <- unique(cv$mean_dist)
best_fold <- folds[abs(folds-(mean_dist)) == min(abs(folds-(mean_dist)))]
cv_best_fold <- filter(cv, mean_dist == best_fold)
print(cv_best_fold$fold_size %>% unique())
## [1] 10000
if (months_ts){
# with different months
r2 <- summary(lm(ET~ET_pred, data=cv_best_fold))$r.squared
Bias <- mean(cv_best_fold$ET_pred - cv_best_fold$ET, na.rm=TRUE)
ggplot(cv_best_fold) +
geom_jitter(aes(x=ET_pred, y=ET, color=as.factor(month)), alpha=0.2, size =.1) +
theme_classic() +
geom_abline(intercept=0,slope=1, color="red") +
# annotate("text", x=4.5, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(Bias, 3))) +
xlab("Predicted natural ET (mm/day)") +
ylab("Observed fallow ET (mm/day)") +
xlim(c(0,8)) +
ylim(c(0,8))
}
## Warning: Removed 37436196 rows containing missing values (geom_point).
print(paste("R2:", round(r2, 3), "Bias:", round(Bias, 3)))
## [1] "R2: 0.63 Bias: 0.055"
if (months_ts){
# averaging over different months
cv_best_fold_year <- cv_best_fold %>% group_by(x, y) %>% summarize(ET = mean(ET, na.rm=TRUE),
ET_pred = mean(ET_pred, na.rm=TRUE))
} else {
cv_best_fold_year <- cv_best_fold
}
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
r2 <- summary(lm(ET~ET_pred, data=cv_best_fold_year))$r.squared
Bias <- mean(cv_best_fold_year$ET_pred - cv_best_fold_year$ET, na.rm=TRUE)
ggplot(cv_best_fold_year) +
geom_jitter(aes(x=ET_pred, y=ET), alpha=0.02, size =.01) +
theme_classic() +
geom_abline(intercept=0,slope=1, color="red") +
# annotate("text", x=4.5, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(Bias, 3))) +
xlab("Predicted natural ET (mm/day)") +
ylab("Observed fallow ET (mm/day)") +
xlim(c(0,6)) +
ylim(c(0,6))
## Warning: Removed 3337080 rows containing missing values (geom_point).
print(paste("R2:", round(r2, 3), "Bias:", round(Bias, 3)))
## [1] "R2: 0.633 Bias: 0.055"
if (months_ts){
# plot out the time series
cv_best_fold_ts <- cv_best_fold %>% group_by(month) %>% summarize(ET = mean(ET, na.rm=TRUE),
ET_pred = mean(ET_pred, na.rm=TRUE))
ggplot(cv_best_fold_ts) +
geom_line(aes(x=month, y=ET), color="red") +
geom_line(aes(x=month, y=ET_pred), color="black") +
geom_line(data = fallow_ts, aes(x=month, y=ET), color="blue") +
geom_line(data = fallow_ts, aes(x=month, y=ET_pred), color="green")
}